Title analysis: titles of male and female speakers

Title analysis

# Load the presentation records. `as.is = TRUE` keeps text columns as
# character instead of converting them to factors.
data.tit <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",",
  header = TRUE,
  as.is = TRUE
)
# Parse the date column (dmy(): day-month-year order) and keep the calendar
# year as its own column.
data.tit$date <- dmy(data.tit$date)
data.tit$year <- year(data.tit$date)
#skimr::skim(data.tit)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by a speaker.

# Drop the special-event records (ids listed below) and every record without
# an English title, then tabulate what is left.
IDs <- c(154, 250, 211, 289)
data.tit <- data.tit %>%
  filter(!(id %in% IDs), !is.na(title_english))
table(data.tit$gender)
## 
##   F   M 
## 144 185
table(data.tit$position_cat, data.tit$gender)
##            
##              F  M
##   others     4  1
##   postdoc   25 32
##   professor 25 75
##   student   88 75

Formatting for tidytext

# Keep only the columns used downstream, then split every English title into
# one token per row (stored in column `word`).
tit <- dplyr::select(
  data.tit,
  id, gender, position_cat, audience_n, title_english
)
text_tok <- unnest_tokens(tit, output = word, input = title_english)
  • Excluding stopwords, e.g. “and” “or” “the” “of” “in”.

  • Standardizing plurals.

# English stop-word list
stop_w <- tibble(word = stopwords(source = "stopwords-iso"))

# Remove the stop words from the corpus
text <- anti_join(text_tok, stop_w, by = "word")

# Remove numbers, the dash token and other unwanted tokens
# NOTE(review): the first entry looks like a mis-encoded character (probably
# the dash mentioned above) — confirm it still matches the token in the data.
remover <- c("ăƒŒ", "1", "1st", "2", "364", "40", "70", "750", "aff", "da")

text <- filter(text, !(word %in% remover))

# Simple plurals: resolved by just dropping the final "s".
plural <- c(
  "actions", "advances", "adaptations", "amphibians", "animals", "ants",
  "anurans", "applications", "approaches", "bees", "builds", "birds",
  "cerrados", "challenges",
  "continents", "crops",
  "decisions", "declines", "determines", "determinants", "defenses",
  "dynamics",
  "economics", "ecosystems", "environments", "experiences",
  "forests",
  "genetics", "gifts", "gradients", "guides", "impacts",
  "increases", "interactions", "lives",
  "landscapes", "males", "mammals", "mangroves", "models", "movements",
  "mutualisms", "networks", "neotropics",
  "opilions", "phenotypes", "plants", "projects", "paths", "perspectives",
  "populations", "promotes", "relationships", "relations",
  "resources", "responses", "roads", "services", "skulls", "snakes", "seeds",
  "spaces", "spiders", "stages", "trees", "variations",
  "threats"
)

# Compute the membership mask once and strip the last character of each match.
is_plural <- text$word %in% plural
text$word[is_plural] <- substr(
  text$word[is_plural], 1, nchar(text$word[is_plural]) - 1
)
# "approaches" takes an "-es" plural, so dropping only the "s" leaves the
# non-word "approache"; map it to the proper singular.
text$word[text$word == "approache"] <- "approach"
  • Grouping similar words:
# Lookup table used to group similar words: column 1 is the form found in the
# titles (misspellings included), column 2 is the standard form that replaces it.
lemma <- rbind(
  c("adaptive", "adaptation"),
  c("advancement", "advance"),
  c("agricultural", "agriculture"),
  c("agro", "agriculture"),
  c("amazonia", "amazon"),
  c("amazonian", "amazon"),
  c("andean", "andes"),
  c("apply", "application"),
  c("applying", "application"),
  c("apidae", "apis"),
  c("arachnida", "arachnid"),
  c("argue", "argument"),
  c("basal", "basis"),
  c("behavioral", "behavior"),
  c("behavioural", "behavior"),
  c("bignonieae", "bignoniaceae"),
  c("biological", "biology"),
  c("brazilian", "brazil"),
  c("building", "build"),
  c("changing", "change"),
  c("cnidarian", "cnidaria"),
  c("coastal", "coast"),
  c("colour", "color"),
  c("colors", "color"),
  c("communities", "community"),
  c("competitive", "competition"),
  c("complexity", "complex"),
  c("convergences", "convergence"),
  c("convergent", "convergence"),
  # Was "cordata.tit": leftover of a data -> data.tit search/replace.
  c("cordatus", "cordata"),
  c("croplands", "crop"),
  c("cultural", "culture"),
  c("darwin's", "darwin"),
  c("darwinian", "darwin"),
  c("defensive", "defense"),
  c("dependent", "dependence"),
  c("detecting", "detection"),
  c("determine", "determinant"),
  c("developmental", "development"),
  c("dispersers", "dispersal"),
  c("disturbed", "disturbance"),
  c("diversification", "diversity"),
  c("dragonflies", "dragonfly"),
  c("drier", "drought"),
  c("ecological", "ecology"),
  c("ecologists", "ecology"),
  c("endemic", "endemism"),
  c("effectiveness", "efficiency"),
  c("environmental", "environment"),
  c("evolutionary", "evolution"),
  c("expanding", "expansion"),
  c("extinct", "extinction"),
  c("facilitate", "facilitation"),
  c("fisheries", "fishery"),
  c("floral", "flora"),
  c("floristic", "flora"),
  c("forested", "forest"),
  c("functional", "function"),
  c("functionally", "function"),
  c("functioning", "function"),
  c("geographical", "geographic"),
  c("heterogeneties", "heterogeneity"),
  c("heterogeneous", "heterogeneity"),
  c("histories", "history"),
  c("integrated", "integration"),
  c("intregating", "integration"),
  c("integrative", "integration"),
  c("invasive", "invasion"),
  c("isotopic", "isotope"),
  c("linking", "link"),
  c("living", "live"),
  c("mammalia", "mammal"),
  c("managed", "manage"),
  c("managers", "manage"),
  c("mathematical", "mathematics"),
  c("mates", "mating"),
  c("mediated", "mediate"),
  c("mechanistic", "mechanism"),
  c("matrices", "matrix"),
  c("migratory", "migration"),
  c("mimicking", "mimicry"),
  c("modeling", "model"),
  c("mutualistic", "mutualism"),
  c("natural", "nature"),
  c("neotropical", "neotropic"),
  c("northeastern", "northeast"),
  c("occuring", "occur"),
  c("onça", "onca"),
  c("opiliones", "opilion"),
  c("parasite", "parasitism"),
  c("parent", "parenting"),
  c("phylogenies", "phylogeny"),
  c("phylogenetic", "phylogeny"),
  c("phylogenomic", "phylogeny"),
  c("pollinators", "pollination"),
  c("protected", "protect"),
  c("protective", "protect"),
  c("rainfall", "rain"),
  c("reconstructing", "reconstruction"),
  c("regulatory", "regulation"),
  c("regulates", "regulation"),
  c("relation", "relationship"),
  c("reproductive", "reproduction"),
  c("restored", "restoration"),
  c("robustness", "robust"),
  c("scientific", "science"),
  c("scientist", "science"),
  c("sexy", "sexual"),
  c("simulated", "simulation"),
  c("societies", "society"),
  c("social", "society"),
  c("socio", "society"),
  c("space", "spatial"),
  c("spacio", "spatial"),
  c("stabilize", "stability"),
  c("stable", "stability"),
  c("stories", "story"),
  c("strategic", "strategy"),
  c("strategies", "strategy"),
  c("structured", "structure"),
  c("structuring", "structure"),
  c("studies", "study"),
  c("studing", "study"),
  c("sustainable", "sustainability"),
  c("theories", "theory"),
  c("theoretical", "theory"),
  c("threatened", "threat"),
  c("tropical", "tropic"),
  c("vision", "visual")
)
# Keep both columns as character (not factor) whatever the R version, so the
# replacement values are inserted as text.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# Replace every word found in the first column of `lemma` with the standard
# form in the second column. seq_len() makes the loop a no-op for an empty
# table, whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}

Counting words Frequency by gender

Removing stopwords, we keep 2340 words.

# Number of remaining tokens per gender, then per position category and gender
table(text$gender)
## 
##    F    M 
## 1082 1258
table(text$position_cat, text$gender)
##            
##               F   M
##   others     16  10
##   postdoc   179 230
##   professor 172 452
##   student   703 554
# Frequency of each distinct word in the cleaned corpus
pala <- count(text, word)

As 20 palavras mais comuns

# Table of the most frequent words. top_n() keeps ties, which is why the
# rendered table can hold more than 20 rows.
word_freq <- count(text, word, sort = TRUE)
kable(top_n(word_freq, 20, n))
word n
ecology 50
forest 42
evolution 32
landscape 27
bird 22
model 22
diversity 21
environment 21
species 21
plant 18
structure 17
atlantic 15
brazil 15
effects 15
conservation 14
interaction 13
study 13
bee 12
community 12
network 12
patterns 12
são 12

word cloud

# Word cloud of the whole corpus
textplot_wordcloud(x = dfm(tokens(text$word)))

# One cloud per gender. The previous graphics settings are saved and restored
# afterwards so the two-panel layout does not leak into later base plots.
old_par <- par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   col = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot to draw without starting a
# new frame, which looks at odds with the side-by-side layout requested
# above — confirm it is intended.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   col = "#FCA532")
par(old_par)

Word frequencies by gender

# Relative frequency of every word within each gender, reshaped to one row per
# word (F and M side by side) and sorted by the absolute difference between
# the two proportions.
props <- count(text, gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Only the 20 rows with the largest absolute difference get a text label
top_rows <- 1:20
props$label <- NA
props$label[top_rows] <- props$word[top_rows]
# Scatter of per-gender word proportions on log-log axes, coloured by the
# absolute difference; the dashed line has slope 1 through the origin. Text
# labels come from the `label` column. The stray empty argument that used to
# sit inside aes() has been removed.
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.5) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

 # geom_smooth(method="lm")
ggsave("figures/title_wordFreq.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.

Correlation of word frequency use between genders:

# Pearson correlation between the female and male word proportions.
# NOTE(review): words used by only one gender presumably have NA in the other
# column after the reshape, so they would not enter the test — confirm.
cor.test(props$proportion_F, props$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  props$proportion_F and props$proportion_M
## t = 15.393, df = 236, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6380756 0.7660220
## sample estimates:
##       cor 
## 0.7078068

Highly correlated -> it means they tend to use the same frequency of main words

# Mirrored bar chart of the labelled words: female proportions are negated so
# they extend to the left of zero and male proportions to the right.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Columns are picked by name (previously by position 2:3) so the reshape
  # does not depend on the column order of `props`.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Axis labels show absolute values on both sides of zero.
ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))

ggsave("figures/title_wordFreq_barplot.jpeg", units = "in", width = 7,
       height = 7, dpi = 300)

PROFESSOR Word frequencies by gender

# Same per-gender word proportions as for the full corpus, restricted to
# tokens from presentations given by professors.
propsP <- filter(text, position_cat == "professor") %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Only the 20 rows with the largest absolute difference get a text label
top_rows <- 1:20
propsP$label <- NA
propsP$label[top_rows] <- propsP$word[top_rows]
# Scatter of professors' per-gender word proportions on log-log axes, coloured
# by the absolute difference; the dashed line has slope 1 through the origin.
ggplot(propsP, aes(x = proportion_M, y = proportion_F,
                   color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
 # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

 # geom_smooth(method="lm")
# The file used to be named "abstract_wordFreq_Prof.jpg"; every other figure
# of this title analysis is saved with the "title_" prefix.
ggsave("figures/title_wordFreq_Prof.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.

Correlation of word frequency use between genders:

# Pearson correlation between the female and male word proportions among
# professors.
# NOTE(review): words used by only one gender presumably have NA in the other
# column after the reshape, so they would not enter the test — confirm.
cor.test(propsP$proportion_F, propsP$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  propsP$proportion_F and propsP$proportion_M
## t = 1.3985, df = 47, p-value = 0.1685
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0861594  0.4554761
## sample estimates:
##       cor 
## 0.1998798

No correlation

20 words with the largest differences in frequency

# Mirrored bar chart of the labelled professor words: female proportions are
# negated so they extend to the left of zero and male proportions to the right.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Columns are picked by name (previously by position 2:3) so the reshape
  # does not depend on the column order of `propsP`.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Axis labels show absolute values on both sides of zero; the label at the
# -0.01 break used to read "-0.01", unlike every other tick.
ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
             linetype = "dotted",
             col = "darkgray") +
  scale_x_continuous(breaks = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
                     labels = c(0.03, 0.02, 0.01, 0, 0.01, 0.02, 0.03))

ggsave("figures/title_wordFreq_barplot_Prof.jpeg", units = "in", width = 7,
       height = 7, dpi = 300)

TF IDF

OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui

# tf-idf of every word, treating each gender as one document
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
#text_id
text_id$word <- as.factor(text_id$word)
# Highest tf-idf words per gender (top_n keeps ties)
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x and the words to y, so the axis title belongs on x
  # (it was previously attached to the word axis).
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

TF IDF professors

OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui

# tf-idf of every word used by professors, treating each gender as one document
text_idP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
#text_id
text_idP$word <- as.factor(text_idP$word)
# Highest tf-idf words per gender (top_n keeps ties)
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x and the words to y, so the axis title belongs on x
  # (it was previously attached to the word axis).
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

Topic model

# Document-term matrix with one document per presentation. The gender letter
# is appended to the id so it stays available in the document name.
# select() is namespaced here, as where the title columns are picked earlier
# in this analysis, so the call does not depend on which attached package
# provides `select`.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# Fit LDA models with 2, 3 and 4 topics (fixed seed) and compare them by AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC     dAIC    df  
## ap_lda3 34358.6     0.0 3184
## ap_lda2 34405.8    47.2 2123
## ap_lda4 35971.6  1613.0 4245

word-topic probabilities

# Ten highest-probability terms (beta) of each topic in the 2-topic model.
# NOTE(review): the AIC table printed just before ranks the 3-topic model
# first, yet the 2-topic fit is used here — confirm this is deliberate.
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Horizontal bars, one panel per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Per-document topic probabilities (gamma). Each document keeps its
# highest-gamma topic; gender is read back from the last character of the
# document name.
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>%
  mutate(gender = substring(document, nchar(document))) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 66 71
##   M 85 96
classifi %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0)
##  gender   1   2
##       F 48% 52%
##       M 47% 53%
# Distribution of the winning gamma per topic, one panel per gender
classifi %>%
 # mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(x = as.character(topic), y = gamma)) +
  geom_boxplot() +
  facet_wrap(~ gender)

Topic model Professors only

# Document-term matrix for professors only, one document per presentation.
# The gender letter is appended to the id so it stays available in the
# document name. select() is namespaced so the call does not depend on which
# attached package provides `select`.
matext <- text %>%
  filter(position_cat == "professor") %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# Fit LDA models with 2, 3 and 4 topics (fixed seed) and compare them by AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC    dAIC   df  
## ap_lda2 8798.6    0.0 807 
## ap_lda3 8825.7   27.0 1210
## ap_lda4 9444.3  645.7 1613

word-topic probabilities

# Ten highest-probability terms (beta) of each topic in the 2-topic model
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Horizontal bars, one panel per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Per-document topic probabilities (gamma). Each document keeps its
# highest-gamma topic; gender is read back from the last character of the
# document name.
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 10 15
##   M 34 40
classifi %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0)
##  gender   1   2
##       F 40% 60%
##       M 46% 54%
# The violin is drawn first and a narrow boxplot on top of it (the same
# layering as the sentiment-score plots); in the previous order the violin
# was painted over the boxplot.
classifi %>%
 # mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  facet_wrap(~ gender)

TITLE - sentiment analysis

Chapter 2, Silge & Robinson. 2018

  • The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
# Preview the NRC lexicon (a word can appear once per sentiment it carries)
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
  • The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
# Preview the Bing lexicon (one sentiment label per word)
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
  • The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
# Preview the AFINN lexicon (numeric score in column `value`)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
  • Another one in the package
# Preview the Loughran lexicon
get_sentiments("loughran")
## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # … with 4,140 more rows

Difference in word sentiment scores between titles of female and male speakers

affword <- get_sentiments("afinn")

# Words that carry an AFINN score, with their count per presentation
affc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, "word")

# Per-presentation scores, computed once and shared by both plots: the plain
# mean over the scored words and the mean weighted by word count. The text
# analysed here comes from presentation titles, so the plot titles say
# "title" (they previously said "abstract").
aff_scores <- affc %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n))

ggplot(aff_scores, aes(x = gender, y = mean.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Mean words score per title and gender")

  #ggbeeswarm::geom_beeswarm(size=3, shape=21)
ggplot(aff_scores, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Weighted mean words score per title and gender")

  #ggbeeswarm::geom_beeswarm(size=3, shape=21)

Frequency of sentiment words per title (NRC lexicon)

As classificações das palavras não me parecem muito acuradas com a linguagem científica

nrcword <- get_sentiments("nrc")

# Number of NRC-tagged word occurrences per presentation, gender and sentiment
nrc <- count(text, id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# One panel per sentiment, comparing the genders
ggplot(data = nrc, mapping = aes(x = gender, y = n)) +
  geom_violin() +
  facet_wrap(~sentiment)

Frequency of sentiment words per title (Bing lexicon)

As classificações das palavras não me parecem muito acuradas com a linguagem científica

bingword <- get_sentiments("bing")

# Number of Bing-tagged word occurrences per presentation, gender and sentiment
bing <- count(text, id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# One panel per gender, comparing the sentiments
ggplot(data = bing, mapping = aes(x = sentiment, y = n)) +
  geom_violin() +
  facet_wrap(~gender)

Frequency of sentiment words per title (Loughran lexicon)

As classificações das palavras não me parecem muito acuradas com a linguagem científica

louword <- get_sentiments("loughran")

# Number of Loughran-tagged word occurrences per presentation, gender and
# sentiment
lou <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(louword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Plot the Loughran counts: the plot previously reused the `bing` table, so
# the Loughran result was never shown.
ggplot(lou, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin()